{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 03 测试我们的算法"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn import datasets "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])"
     },
     "metadata": {},
     "execution_count": 2
    }
   ],
   "source": [
    "iris = datasets.load_iris()\n",
    "iris.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "X = iris.data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "y = iris.target"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "(150, 4)"
     },
     "metadata": {},
     "execution_count": 5
    }
   ],
   "source": [
    "X.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "(150,)"
     },
     "metadata": {},
     "execution_count": 6
    }
   ],
   "source": [
    "y.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "### train_test_split\n",
    "\n",
    "分离出一部分数据做训练，另外一部分数据做测试。\n",
    "不能简单的使用切片对X和y进行训练和测试分离，因为看看y的数据就知道元数据可能是有顺序的。所以最好进行乱序化处理\n",
    "但是X和y是对应的，不能各自乱序化\n",
    "可以合并起来进行乱序化\n",
    "或者直接获得乱序化的索引"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])"
     },
     "metadata": {},
     "execution_count": 7
    }
   ],
   "source": [
    "y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "array([ 12, 106,  60,  10, 131, 100, 148,  82,  52,  91,  56,   1,  41,\n        30, 147,  73,  57, 144,  75,   6, 128,  96,   2,  55,  25,   4,\n       113,  50,  22, 115,  20,   7, 141, 126,  21, 124, 139,  14, 146,\n        18,  37, 140,  44, 125,  72,  48,  65, 102,   0,  31,  89,   9,\n       101, 105,  70,  58,  95,  35,  15,  29, 107,  32,  78, 145, 116,\n       136,  90,  36,  63,  61, 137, 130,  17,  26, 129,  33, 104,  97,\n       120,  87,  85,  27, 123,  94,  67,  83,  16,   5,  93,  69,  99,\n       114,  47,  42, 138, 108, 135, 112,  76, 118, 110,  38,  24, 111,\n       149, 122,  64,  66,  54, 134, 142, 103,  74, 117,  77,  59,  51,\n       121,  28,  43,  11,  45,  84,  68,  62,  92, 127,  53,  71,  81,\n        49,  46, 109,   3,  19, 143,   8, 119,  98,  88, 132,  23,  34,\n        79,  80,  86,  39, 133,  40,  13])"
     },
     "metadata": {},
     "execution_count": 8
    }
   ],
   "source": [
    "# np.random.permutation是对上限以内的数字进行乱序排序\n",
    "shuffled_indexes = np.random.permutation(len(X))\n",
    "shuffled_indexes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 测试数据集个数\n",
    "test_ratio = 0.2\n",
    "# 去掉小数\n",
    "test_size = int(len(X) * test_ratio)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "test_indexes = shuffled_indexes[:test_size]\n",
    "train_indexes = shuffled_indexes[test_size:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "X_train = X[train_indexes]\n",
    "y_train = y[train_indexes]\n",
    "\n",
    "X_test = X[test_indexes]\n",
    "y_test = y[test_indexes]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "(120, 4)\n(120,)\n"
    }
   ],
   "source": [
    "print(X_train.shape)\n",
    "print(y_train.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "(30, 4)\n(30,)\n"
    }
   ],
   "source": [
    "print(X_test.shape)\n",
    "print(y_test.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 封装"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from playML.model_selection import train_test_split\n",
    "\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "(120, 4)\n(120,)\n"
    }
   ],
   "source": [
    "print(X_train.shape)\n",
    "print(y_train.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "(30, 4)\n(30,)\n"
    }
   ],
   "source": [
    "print(X_test.shape)\n",
    "print(y_test.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 测试我们的算法"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from playML.kNN import KNNClassifier\n",
    "\n",
    "my_knn_clf = KNNClassifier(k=3)\n",
    "my_knn_clf.fit(X_train, y_train)\n",
    "y_predict = my_knn_clf.predict(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "array([1, 0, 0, 0, 0, 0, 0, 2, 0, 1, 2, 0, 2, 1, 1, 2, 1, 1, 2, 2, 1, 1,\n       2, 0, 0, 1, 1, 2, 2, 0])"
     },
     "metadata": {},
     "execution_count": 18
    }
   ],
   "source": [
    "y_predict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "array([1, 0, 0, 0, 0, 0, 0, 2, 0, 1, 2, 0, 2, 1, 1, 2, 1, 1, 2, 2, 1, 1,\n       2, 0, 0, 1, 2, 2, 2, 0])"
     },
     "metadata": {},
     "execution_count": 19
    }
   ],
   "source": [
    "y_test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "29"
     },
     "metadata": {},
     "execution_count": 20
    }
   ],
   "source": [
    "# 看一下预测准确率，也就是y_predict和测试集相等的元素个数\n",
    "sum(y_predict == y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "0.9666666666666667"
     },
     "metadata": {},
     "execution_count": 21
    }
   ],
   "source": [
    "# 算出预测准确率\n",
    "sum(y_predict == y_test) / len(y_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### sklearn中的train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "<function sklearn.model_selection._split.train_test_split(*arrays, **options)>"
     },
     "metadata": {},
     "execution_count": 22
    }
   ],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "(120, 4)\n(120,)\n"
    }
   ],
   "source": [
    "print(X_train.shape)\n",
    "print(y_train.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "(30, 4)\n(30,)\n"
    }
   ],
   "source": [
    "print(X_test.shape)\n",
    "print(y_test.shape)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.6.8 64-bit ('anaconda': conda)",
   "language": "python",
   "name": "python36864bitanacondaconda085db2e8b14649f4b32196068f7124e9"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7-final"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}